Preparing the data

Read in datasets to be used:

The file is downloaded directly from the Data Store url; the fs package is used to locate the zip file on disk, and utils::unzip() extracts its contents.

#If first time running script:

# Check whether the zip has already been downloaded; build the path once
# with here::here() so both the check and download.file() agree on it
zip_path <- here::here("data", "statistical-gis-boundaries-london.zip")
checkfile <- fs::file_exists(zip_path)

if (checkfile) {
  cat(cyan("\nAlready loaded!\n"))
  
} else {
  download.file("https://data.london.gov.uk/download/statistical-gis-boundary-files-london/9ba8c833-6370-4b11-abdc-314aa020d5e0/statistical-gis-boundaries-london.zip", 
              destfile = zip_path)

  # Locate the downloaded archive ("\\.zip$" escapes the dot and anchors
  # the match to the file extension) and extract it into data/
  listfiles <- fs::dir_info(here::here("data")) %>%
    dplyr::filter(str_detect(path, "\\.zip$")) %>%
    dplyr::select(path) %>%
    dplyr::pull() %>%
    #print out the zip file path being extracted
    print() %>%
    as.character() %>%
    utils::unzip(exdir = here::here("data"))

  cat(cyan("\nDownloaded from London Data Store\n"))
}

# Read in the ward boundaries shapefile (shared by both branches above,
# so the read happens exactly once). "$" anchors the regex to the end of
# the path, and "\\." escapes the dot so only the .shp file matches
Londonwards <- fs::dir_info(here::here("data", 
                                 "statistical-gis-boundaries-london", 
                                 "ESRI")) %>%
  dplyr::filter(str_detect(path, 
                           "London_Ward_CityMerged\\.shp$")) %>%
  dplyr::select(path) %>%
  dplyr::pull() %>%
  #read the file in
  sf::st_read()
## Reading layer `London_Ward_CityMerged' from data source 
##   `C:\Users\joepo\Documents\Uni\UCL CASA\CASA0005\gis_wk8\data\statistical-gis-boundaries-london\ESRI\London_Ward_CityMerged.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 625 features and 7 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 503568.2 ymin: 155850.8 xmax: 561957.5 ymax: 200933.9
## Projected CRS: OSGB36 / British National Grid
## 
## Already loaded!
# Read in attribute data
# Load the ward-level attribute data; blanks, "NA" and "n/a" are all
# treated as missing, and janitor::clean_names() standardises the
# column names to snake_case
LondonWardProfiles <- here::here("data", "ward-profiles-excel-version.csv") %>%
  read_csv(na = c("", "NA", "n/a"),
           col_names = TRUE,
           locale = locale(encoding = 'Latin1')) %>%
  clean_names()
## Rows: 660 Columns: 67
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Ward name, Old code, New code
## dbl (64): Population - 2015, Children aged 0-15 - 2015, Working-age (16-64) ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# spec(LondonWardProfiles)


# Read in schools point data, convert the x/y columns into an sf point
# geometry in WGS84 (EPSG:4326), and keep only the secondary schools
ldn_schools <- here::here("data", "all_schools_xy_2016.csv") %>%
  read_csv(col_names = TRUE,
           locale = locale(encoding = 'Latin1')) %>%
  st_as_sf(coords = c("x", "y"), crs = 4326) %>%
  filter(PHASE == "Secondary")
## Rows: 3889 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): SCHOOL_NAM, TYPE, PHASE, ADDRESS, TOWN, POSTCODE, STATUS, GENDER, ...
## dbl  (7): URN, EASTING, NORTHING, map_icon_l, Primary, x, y
## num  (1): OBJECTID
## lgl  (2): NEW_URN, OLD_URN
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Check the data

Plot a map of the shapefile data:

# Quick thematic maps to sanity-check each spatial layer loads correctly
qtm(Londonwards)

qtm(ldn_schools)

# Check CRS for each geofile. st_crs() returns a crs object whose first
# element is the user-input string (see the $input output below), so
# appending the two objects puts the wards CRS at index 1 and the
# schools CRS at index 3 — NOTE(review): this relies on the internal
# layout of the crs object; verify if sf changes it
check_crs <- append(st_crs(Londonwards), st_crs(ldn_schools))
check_crs[1]
## $input
## [1] "OSGB36 / British National Grid"
check_crs[3]
## $input
## [1] "EPSG:4326"

View a sample of the attribute data:

# Preview the first six rows of the attribute data to confirm the load
head(LondonWardProfiles)
## # A tibble: 6 × 67
##   ward_name      old_c…¹ new_c…² popul…³ child…⁴ worki…⁵ older…⁶ perce…⁷ perce…⁸
##   <chr>          <chr>   <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 City of London 00AA    E09000…    8100     650    6250    1250     8      76.9
## 2 Barking and D… 00ABFX  E05000…   14750    3850   10150     750    26      69  
## 3 Barking and D… 00ABFY  E05000…   10600    2700    6800    1050    25.7    64.3
## 4 Barking and D… 00ABFZ  E05000…   12700    3200    8350    1100    25.4    65.9
## 5 Barking and D… 00ABGA  E05000…   10400    2550    6400    1450    24.3    61.5
## 6 Barking and D… 00ABGB  E05000…   10750    2150    7050    1550    20.1    65.7
## # … with 58 more variables: percent_all_older_people_aged_65_2015 <dbl>,
## #   mean_age_2013 <dbl>, median_age_2013 <dbl>, area_square_kilometres <dbl>,
## #   population_density_persons_per_sq_km_2013 <dbl>, percent_bame_2011 <dbl>,
## #   percent_not_born_in_uk_2011 <dbl>,
## #   percent_english_is_first_language_of_no_one_in_household_2011 <dbl>,
## #   general_fertility_rate_2013 <dbl>, male_life_expectancy_2009_13 <dbl>,
## #   female_life_expectancy_2009_13 <dbl>, …

Merge spatial and attribute data

Reproject spatial objects to the same CRS.

Merge the attribute data to the wards shapefile.

#Transform the CRS for school points to the British grid 
# (EPSG:27700), matching the wards layer, so both can be overlaid
ldn_schools <- st_transform(ldn_schools, crs = 27700)

# Join the ward-level attributes onto the boundary polygons by GSS code
wards_merged <- Londonwards %>%
  left_join(LondonWardProfiles, by = c("GSS_CODE" = "new_code")) %>%
  #Select desired columns
  #Note that 'geometry' is sticky - i.e. does not need to be specified in select statement to be retained
  select(NAME, GSS_CODE, BOROUGH, 
         population_2015, population_density_persons_per_sq_km_2013,
         average_gcse_capped_point_scores_2014,
         unauthorised_absence_in_all_schools_percent_2013,
         median_house_price_2014)

Map the joined file to check the data

# Choropleth of GCSE scores to confirm the attribute join worked
qtm(wards_merged, 
    fill = "average_gcse_capped_point_scores_2014",
    borders = NULL)

Regression Model

Start the investigation by fitting a linear regression model to the dependent and independent variables. The first step here is to check the distributions of the input variables.

# Distribution of the dependent variable (GCSE scores): density-scaled
# histogram with a kernel density overlay to eyeball normality
dist_gcse <- ggplot(wards_merged,
                    aes(x = average_gcse_capped_point_scores_2014)) +
  # after_stat(density) replaces the ..density.. notation deprecated
  # in ggplot2 3.4.0
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = 5) +
  #Add density line
  geom_density(colour = "orange",
               linewidth = 1, adjust = 1)
dist_gcse

# Distribution of the independent variable (unauthorised absence):
# density-scaled histogram with a kernel density overlay
dist_absence <- ggplot(wards_merged,
                    aes(x = unauthorised_absence_in_all_schools_percent_2013)) +
  # after_stat(density) replaces the deprecated ..density.. notation
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = 0.1) +
  #Add density line
  geom_density(colour = "lightblue",
               linewidth = 1, adjust = 1)
dist_absence

Both variables are roughly normally distributed.

#Create a regression model on its own
# Simple linear model: GCSE scores explained by unauthorised school
# absence; the magrittr dot passes wards_merged to lm()'s 'data' argument
model1 <- wards_merged %>%
  lm(average_gcse_capped_point_scores_2014 ~ unauthorised_absence_in_all_schools_percent_2013,
     data = .)

summary(model1)
## 
## Call:
## lm(formula = average_gcse_capped_point_scores_2014 ~ unauthorised_absence_in_all_schools_percent_2013, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.753 -10.223  -1.063   8.547  61.842 
## 
## Coefficients:
##                                                  Estimate Std. Error t value
## (Intercept)                                       371.471      2.165   171.6
## unauthorised_absence_in_all_schools_percent_2013  -41.237      1.927   -21.4
##                                                  Pr(>|t|)    
## (Intercept)                                        <2e-16 ***
## unauthorised_absence_in_all_schools_percent_2013   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.39 on 624 degrees of freedom
## Multiple R-squared:  0.4233, Adjusted R-squared:  0.4224 
## F-statistic:   458 on 1 and 624 DF,  p-value: < 2.2e-16

The scatterplot below shows GCSE scores by school absences.

# Create a base scatter plot; ggplot() + geom_point() replaces qplot(),
# which was deprecated in ggplot2 3.4.0
q <- ggplot(wards_merged,
            aes(x = unauthorised_absence_in_all_schools_percent_2013,
                y = average_gcse_capped_point_scores_2014)) +
  geom_point()
#Add a regression line ('linewidth' replaces the 'size' aesthetic for
# lines, deprecated in ggplot2 3.4.0)
q + stat_smooth(method = "lm", se = FALSE, linewidth = 1) + 
  #jitter argument used as the x-scale is rounded
  geom_jitter()
## `geom_smooth()` using formula = 'y ~ x'

The assumptions of a linear regression model must then be validated. These are:

  1. Is there a linear association between the variables?

  2. Are the errors independent?

  3. Are the errors normally distributed?

  4. Do the errors have equal variance?


To test these assumptions, the model first needs to be tidied using ‘broom’ and ‘tidypredict’ packages.

#Create a tibble storing the test statistics
# broom::tidy() returns one row per model term with estimate,
# std.error, statistic and p.value columns
broom::tidy(model1)
## # A tibble: 2 × 5
##   term                                          estim…¹ std.e…² stati…³  p.value
##   <chr>                                           <dbl>   <dbl>   <dbl>    <dbl>
## 1 (Intercept)                                     371.     2.16   172.  0       
## 2 unauthorised_absence_in_all_schools_percent_…   -41.2    1.93   -21.4 1.27e-76
## # … with abbreviated variable names ¹​estimate, ²​std.error, ³​statistic
#Creates a tibble of the extended regression results
# glance() gives a one-row model-level summary (R-squared, AIC, BIC, ...)
glance(model1)
## # A tibble: 1 × 12
##   r.squared adj.r.squa…¹ sigma stati…²  p.value    df logLik   AIC   BIC devia…³
##       <dbl>        <dbl> <dbl>   <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>   <dbl>
## 1     0.423        0.422  16.4    458. 1.27e-76     1 -2638. 5282. 5296. 167699.
## # … with 2 more variables: df.residual <int>, nobs <int>, and abbreviated
## #   variable names ¹​adj.r.squared, ²​statistic, ³​deviance
#Use tidypredict to add a column of expected result given the model
# NOTE(review): df_model1 is reassigned with augment() further down, so
# the tidypredict 'fit' column is not used downstream — confirm intended
df_model1 <- wards_merged %>% tidypredict_to_column(model1)

Assumption 1: Linear association

Plot a scatterplot to visualise the relationship. See the scatterplot above for confirmation of this assumption.

Transforming variables for regression
If variables are not normally distributed, they will often need to be transformed in some way to make them more appropriate for linear regression. A useful method for this is Tukey’s Ladder, which tests different levels of power relationships to see which might have the most normal outcomes.

# Tukey's ladder: boxplots of median house price raised to each power in
# the sequence, to find the transformation closest to a normal shape
symbox(~median_house_price_2014,
       wards_merged,
       na.rm= TRUE,
       powers = seq(-3, 3, by=0.5))  #vector of powers to which 'x' is raised (here specified as a sequence)

The Tukey’s ladder suggests that a power relationship of x raised to the power of -1 will have the closest approximation to a normal distribution. To visualise this in a scatterplot:

# Scatter plot of the reciprocal-transformed house price against GCSE
# scores; ggplot() replaces qplot(), deprecated in ggplot2 3.4.0
ggplot(wards_merged,
       aes(x = (median_house_price_2014)^-1,
           y = average_gcse_capped_point_scores_2014)) +
  geom_point()

Assumption 2: Normally distributed errors

Use the ‘augment’ function to store the fitted values and residuals calculated from the model, to plot a histogram of the residuals. From the plot below, we can be satisfied that assumption 2 has been met.

# Attach model1's fitted values and residuals to the ward data
df_model1 <- augment(model1, wards_merged)

#Plot histogram of the residuals
# A single ggplot() histogram replaces the original qplot(vector) +
# geom_histogram(), which layered two identical histograms (hence the
# stat_bin message printing twice) and used the deprecated qplot()
df_model1 %>%
  ggplot(aes(x = .resid)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Assumption 3: Independent errors & errors with equal variance

The best way to quickly assess whether assumption 3 has been met, is to produce a scatter plot of residuals versus fitted values.

#Quick plot
# qplot(x = .fitted, 
#       y = .resid,
#       data=df_model1)

#create using ggplot (better practice)
# Residuals vs fitted values: look for a random, even band around the
# zero line (independence + constant variance)
ggplot(data = df_model1,
       aes(x = .fitted, y = .resid)) +
  geom_point() + 
  geom_hline(yintercept = 0,
             color = "blue") +
  labs(title = "Scatter plot of GCSE score fitted values versus residuals")

How to interpret this? If residuals (errors) are independent, we would expect to see:

  • The residuals form a roughly horizontal band around the 0 line. This suggests the variance of the errors are equal.

  • The residuals “bounce randomly” around the 0 line. This suggests that the errors are independent.

  • No one residual “stands out” from the basic random pattern of residuals. This suggests that there are no outliers.


  • Attribute data: London wards atlas

Plotting a linear regression model in R will quickly produce a suite of plots that can help when assessing assumptions (including the residuals vs fit plot shown above).

# Base-R diagnostic plots for model1 (residuals vs fitted, Q-Q,
# scale-location, residuals vs leverage) in a 2x2 array; save and
# restore the graphics settings so later plots are not left in a grid
oldpar <- par(mfrow = c(2, 2))
plot(model1)
par(oldpar)

An additional method is to test independence of errors (i.e. the absence of ‘autocorrelation’) with a Durbin-Watson test. The statistic ranges from 0 to 4; values between 1 and 3 (ideally close to 2) give least cause for concern.

# Run a Durbin-Watson test for serial autocorrelation in the residuals
DW <- model1 %>% durbinWatsonTest()
# Tidy the result into a one-row tibble for inspection
tidy(DW)
## # A tibble: 1 × 5
##   statistic p.value autocorrelation method             alternative
##       <dbl>   <dbl>           <dbl> <chr>              <chr>      
## 1      1.58       0           0.209 Durbin-Watson Test two.sided

Multiple Regression

To advance the analysis, additional independent variables can be added to the regression model. In this case, median house price has been added to Model 1. Note that the transformed values for median house price will be used (as tested in Tukey’s ladder above).
Additional note: unable to successfully run model raising to power of -1, so log transformed values used instead

# Multiple regression: add log-transformed median house price as a
# second predictor (log rather than the ^-1 from Tukey's ladder, since
# the power-of-(-1) model would not run successfully — see note above)
model2 <- lm(average_gcse_capped_point_scores_2014 ~ 
               unauthorised_absence_in_all_schools_percent_2013 + log(median_house_price_2014),
             data = wards_merged)

# Per-term coefficient estimates for model2
tidy(model2)
## # A tibble: 3 × 5
##   term                                          estim…¹ std.e…² stati…³  p.value
##   <chr>                                           <dbl>   <dbl>   <dbl>    <dbl>
## 1 (Intercept)                                     202.    20.1    10.0  4.79e-22
## 2 unauthorised_absence_in_all_schools_percent_…   -36.2    1.92  -18.9  3.71e-63
## 3 log(median_house_price_2014)                     12.8    1.50    8.50 1.37e-16
## # … with abbreviated variable names ¹​estimate, ²​std.error, ³​statistic
# Model-level fit statistics for model2
glance(model2)
## # A tibble: 1 × 12
##   r.squared adj.r.squa…¹ sigma stati…²  p.value    df logLik   AIC   BIC devia…³
##       <dbl>        <dbl> <dbl>   <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>   <dbl>
## 1     0.483        0.482  15.5    291. 4.78e-90     2 -2604. 5215. 5233. 150261.
## # … with 2 more variables: df.residual <int>, nobs <int>, and abbreviated
## #   variable names ¹​adj.r.squared, ²​statistic, ³​deviance
# Attach model2's fitted values and residuals to the ward data
df_model2 <- augment(model2, wards_merged)

Multiple regression assumptions: No multicollinearity

To check for multicollinearity among variables, need to calculate the correlation coefficient.

# Pairwise correlation matrix across the candidate predictors, to check
# for multicollinearity before trusting model2's coefficients
Correlation <- wards_merged %>%
  st_drop_geometry() %>%  #convert from spatial to regular df
  select(average_gcse_capped_point_scores_2014, unauthorised_absence_in_all_schools_percent_2013, median_house_price_2014) %>%
  mutate(ln_median_house_price_2014 = log(median_house_price_2014)) %>%  #Create log-transformed column of median house price
  # select(-median_house_price_2014) %>%  #Remove the raw column from dataset
  correlate()
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
Correlation
## # A tibble: 4 × 5
##   term                                           avera…¹ unaut…² media…³ ln_me…⁴
##   <chr>                                            <dbl>   <dbl>   <dbl>   <dbl>
## 1 average_gcse_capped_point_scores_2014           NA      -0.651   0.348   0.434
## 2 unauthorised_absence_in_all_schools_percent_2…  -0.651  NA      -0.217  -0.308
## 3 median_house_price_2014                          0.348  -0.217  NA       0.916
## 4 ln_median_house_price_2014                       0.434  -0.308   0.916  NA    
## # … with abbreviated variable names ¹​average_gcse_capped_point_scores_2014,
## #   ²​unauthorised_absence_in_all_schools_percent_2013,
## #   ³​median_house_price_2014, ⁴​ln_median_house_price_2014
#Visualise the correlation matrix
# rplot(Correlation %>% 
#         focus(-average_gcse_capped_point_scores_2014, 
#               -median_house_price_2014,
#               mirror = TRUE)
#       )

Variance Inflation Factor (VIF)

The VIF is another way to assess multicollinearity, and can be used as a method to reduce unnecessary predictors. A VIF above 10 is considered problematic.

# Variance inflation factors for model2's predictors (values well below
# 10 indicate multicollinearity is not a concern)
vif(model2)
## unauthorised_absence_in_all_schools_percent_2013 
##                                         1.105044 
##                     log(median_house_price_2014) 
##                                         1.105044

Checking assumptions

# Base-R diagnostic plots for model2 in a 2x2 array; save and restore
# the graphics settings so later plots are not left in a grid layout
oldpar <- par(mfrow = c(2, 2))
plot(model2)
par(oldpar)

Spatial Autocorrelation

This dataset has a spatial component, and therefore a traditional Durbin-Watson test is not particularly appropriate. Instead, need to test for spatial autocorrelation. The first step in this approach is to plot the residuals on a map.

#Convert df_model2 to spatial object
# augment() returns a plain tibble, so restore the sf class (the
# geometry column is still carried in the data)
sf_model2 <- df_model2 %>%
  st_as_sf(.)

tmap_mode("view")
## tmap mode set to interactive viewing
#Create basemap of wards
# Map the model residuals by ward, with school point locations overlaid,
# to look for visible spatial clustering of over/under-prediction
tm_shape(sf_model2) +
  #Fill polygons with residual value
  tm_polygons(".resid",
              palette = "RdYlBu") +
  #Add points from London schools data
  tm_shape(ldn_schools) +
  tm_dots(col = "TYPE")
## Variable(s) ".resid" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

Test statistics - Moran’s I

The Moran’s I statistic is used to identify the presence of spatial autocorrelation in a dataset. To calculate Moran’s I, we need to create a spatial weights matrix based on the neighbours of each polygon. Therefore, the first step is to calculate the centroid of each polygon to use in a nearest-neighbours calculation.

#Calculate centroids
# Centroids give one representative point per ward for the neighbour
# plots and the k-nearest-neighbour calculation below
wards_coords <- wards_merged %>%
  st_centroid() %>%
  st_geometry()
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over
## geometries of x
#Create binary matrix of neighbours
# Queen contiguity: wards sharing any boundary point count as neighbours
wards_nb <- wards_merged %>%
  poly2nb(queen = TRUE)

# wards_coords is already an sfc geometry set, so it can be passed to
# plot() directly — the redundant st_geometry() wrapper is removed
plot(wards_nb,
     wards_coords,
     col = "blue")

Can also calculate neighbours using the k nearest neighbours method (based off distance from centroid)

# k nearest neighbours: link each ward centroid to its 4 closest
# centroids; knn2nb() (called with parentheses for clarity) converts
# the knn object into a neighbours list
wards_nbknn <- wards_coords %>%
  knearneigh(k = 4) %>%
  knn2nb()
## Warning in knearneigh(., k = 4): knearneigh: identical points found
## Warning in knearneigh(., k = 4): knearneigh: kd_tree not available for identical
## points
# wards_coords is already an sfc geometry, so no st_geometry() needed
plot(wards_nbknn,
     wards_coords,
     col = "green")

Use the neighbours matrix to calculate spatial weights.

# Build row-standardised spatial weights from each neighbours list
swm_queens <- wards_nb %>% nb2listw(style = "W")  #'W' style indicates a row standardised matrix

swm_knn <- wards_nbknn %>% nb2listw(style = "W")

Finally, use the spatial weights matrix to calculate the Moran’s I test statistic.

# Moran's I on the model2 residuals using the queen-contiguity weights;
# pull(.resid) extracts the residual vector in a single step
moransI_queens <- df_model2 %>%
  st_drop_geometry() %>%
  pull(.resid) %>%
  moran.test(swm_queens) %>%
  tidy()

moransI_queens
## # A tibble: 1 × 7
##   estimate1 estimate2 estimate3 statistic  p.value method                alter…¹
##       <dbl>     <dbl>     <dbl>     <dbl>    <dbl> <chr>                 <chr>  
## 1     0.282   -0.0016  0.000556      12.0 1.54e-33 Moran I test under r… greater
## # … with abbreviated variable name ¹​alternative
# Moran's I on the model2 residuals using the k-nearest-neighbour
# weights, for comparison with the queen-contiguity result
moransI_knn <- df_model2 %>%
  st_drop_geometry() %>%
  pull(.resid) %>%
  moran.test(swm_knn) %>%
  tidy()

moransI_knn
## # A tibble: 1 × 7
##   estimate1 estimate2 estimate3 statistic  p.value method                alter…¹
##       <dbl>     <dbl>     <dbl>     <dbl>    <dbl> <chr>                 <chr>  
## 1     0.292   -0.0016  0.000718      10.9 3.78e-28 Moran I test under r… greater
## # … with abbreviated variable name ¹​alternative

For both of our neighbours models, the Moran’s I falls between 0.28 and 0.30. Moran’s I ranges from -1 to 1, so we are observing some weak positive spatial autocorrelation (clustering) in the residuals.

Spatial Regression